import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os

# List every data file available under the Kaggle input directory.
# NOTE: the loop body below had lost its indentation in the original export
# (a SyntaxError as written); structure restored here.
# On a non-Kaggle machine the directory does not exist and os.walk simply
# yields nothing, so this prints nothing.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# Load the raw tornado dataset.  The raw-string prefix keeps the Windows
# backslashes in the path from being interpreted as escape sequences.
csv_path = r'C:\MachineLearning\Dataset\TornadosDataset\tornados.csv.zip'
df = pd.read_csv(csv_path)  # load only; no additional processing at this point
df.head(10)
| om | yr | mo | dy | date | time | tz | datetime_utc | st | stf | ... | elon | len | wid | ns | sn | f1 | f2 | f3 | f4 | fc | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 192 | 1950 | 10 | 1 | 1950-10-01 | 21:00:00 | America/Chicago | 1950-10-02T03:00:00Z | OK | 40 | ... | -102.30 | 15.8 | 10 | 1 | 1 | 25 | 0 | 0 | 0 | False |
| 1 | 193 | 1950 | 10 | 9 | 1950-10-09 | 02:15:00 | America/Chicago | 1950-10-09T08:15:00Z | NC | 37 | ... | 0.00 | 2.0 | 880 | 1 | 1 | 47 | 0 | 0 | 0 | False |
| 2 | 195 | 1950 | 11 | 20 | 1950-11-20 | 02:20:00 | America/Chicago | 1950-11-20T08:20:00Z | KY | 21 | ... | 0.00 | 0.1 | 10 | 1 | 1 | 177 | 0 | 0 | 0 | False |
| 3 | 196 | 1950 | 11 | 20 | 1950-11-20 | 04:00:00 | America/Chicago | 1950-11-20T10:00:00Z | KY | 21 | ... | 0.00 | 0.1 | 10 | 1 | 1 | 209 | 0 | 0 | 0 | False |
| 4 | 197 | 1950 | 11 | 20 | 1950-11-20 | 07:30:00 | America/Chicago | 1950-11-20T13:30:00Z | MS | 28 | ... | 0.00 | 2.0 | 37 | 1 | 1 | 101 | 0 | 0 | 0 | False |
| 5 | 194 | 1950 | 11 | 4 | 1950-11-04 | 17:00:00 | America/Chicago | 1950-11-04T23:00:00Z | PA | 42 | ... | -75.93 | 15.9 | 100 | 1 | 1 | 71 | 11 | 0 | 0 | False |
| 6 | 198 | 1950 | 12 | 2 | 1950-12-02 | 15:00:00 | America/Chicago | 1950-12-02T21:00:00Z | IL | 17 | ... | -89.72 | 18.8 | 50 | 1 | 1 | 119 | 117 | 0 | 0 | False |
| 7 | 199 | 1950 | 12 | 2 | 1950-12-02 | 16:00:00 | America/Chicago | 1950-12-02T22:00:00Z | IL | 17 | ... | -89.38 | 18.0 | 200 | 1 | 1 | 119 | 5 | 0 | 0 | False |
| 8 | 200 | 1950 | 12 | 2 | 1950-12-02 | 16:25:00 | America/Chicago | 1950-12-02T22:25:00Z | AR | 5 | ... | -91.72 | 7.8 | 10 | 1 | 1 | 65 | 0 | 0 | 0 | False |
| 9 | 201 | 1950 | 12 | 2 | 1950-12-02 | 17:30:00 | America/Chicago | 1950-12-02T23:30:00Z | IL | 17 | ... | -89.62 | 9.6 | 50 | 1 | 1 | 157 | 0 | 0 | 0 | False |
10 rows × 27 columns
# Dataset dimensions: (rows, columns).
df.shape
(68693, 27)
# Column dtypes, non-null counts and memory footprint.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68693 entries, 0 to 68692
Data columns (total 27 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 om 68693 non-null int64
1 yr 68693 non-null int64
2 mo 68693 non-null int64
3 dy 68693 non-null int64
4 date 68693 non-null object
5 time 68693 non-null object
6 tz 68693 non-null object
7 datetime_utc 68693 non-null object
8 st 68693 non-null object
9 stf 68693 non-null int64
10 mag 67937 non-null float64
11 inj 68693 non-null int64
12 fat 68693 non-null int64
13 loss 41523 non-null float64
14 slat 68693 non-null float64
15 slon 68693 non-null float64
16 elat 68693 non-null float64
17 elon 68693 non-null float64
18 len 68693 non-null float64
19 wid 68693 non-null int64
20 ns 68693 non-null int64
21 sn 68693 non-null int64
22 f1 68693 non-null int64
23 f2 68693 non-null int64
24 f3 68693 non-null int64
25 f4 68693 non-null int64
26 fc 68693 non-null bool
dtypes: bool(1), float64(7), int64(14), object(5)
memory usage: 13.7+ MB
# Summary statistics, transposed so each row describes one numeric column.
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| om | 68693.0 | 1.132018e+05 | 2.266220e+05 | 1.0000 | 285.00 | 588.0000 | 1118.00 | 6.220800e+05 |
| yr | 68693.0 | 1.991854e+03 | 1.956516e+01 | 1950.0000 | 1976.00 | 1995.0000 | 2008.00 | 2.022000e+03 |
| mo | 68693.0 | 5.968541e+00 | 2.444656e+00 | 1.0000 | 4.00 | 6.0000 | 7.00 | 1.200000e+01 |
| dy | 68693.0 | 1.593088e+01 | 8.750070e+00 | 1.0000 | 8.00 | 16.0000 | 24.00 | 3.100000e+01 |
| stf | 68693.0 | 2.922026e+01 | 1.501327e+01 | 1.0000 | 18.00 | 28.0000 | 42.00 | 7.800000e+01 |
| mag | 67937.0 | 7.787215e-01 | 8.957898e-01 | 0.0000 | 0.00 | 1.0000 | 1.00 | 5.000000e+00 |
| inj | 68693.0 | 1.418689e+00 | 1.811475e+01 | 0.0000 | 0.00 | 0.0000 | 0.00 | 1.740000e+03 |
| fat | 68693.0 | 8.931041e-02 | 1.472120e+00 | 0.0000 | 0.00 | 0.0000 | 0.00 | 1.580000e+02 |
| loss | 41523.0 | 2.020898e+06 | 3.039588e+07 | 50.0000 | 10000.00 | 50000.0000 | 500000.00 | 2.800100e+09 |
| slat | 68693.0 | 3.712939e+01 | 5.099005e+00 | 17.7212 | 33.18 | 37.0000 | 40.92 | 6.102000e+01 |
| slon | 68693.0 | -9.276149e+01 | 8.672112e+00 | -163.5300 | -98.42 | -93.5552 | -86.73 | -6.471510e+01 |
| elat | 68693.0 | 2.296065e+01 | 1.852814e+01 | 0.0000 | 0.00 | 32.5500 | 38.65 | 6.102000e+01 |
| elon | 68693.0 | -5.683609e+01 | 4.534073e+01 | -163.5300 | -94.78 | -84.7200 | 0.00 | 0.000000e+00 |
| len | 68693.0 | 3.489270e+00 | 8.247115e+00 | 0.0000 | 0.12 | 0.8000 | 3.21 | 2.347000e+02 |
| wid | 68693.0 | 1.077676e+02 | 2.068513e+02 | 0.0000 | 20.00 | 50.0000 | 100.00 | 4.576000e+03 |
| ns | 68693.0 | 1.008764e+00 | 9.505967e-02 | 1.0000 | 1.00 | 1.0000 | 1.00 | 3.000000e+00 |
| sn | 68693.0 | 9.914111e-01 | 9.227835e-02 | 0.0000 | 1.00 | 1.0000 | 1.00 | 1.000000e+00 |
| f1 | 68693.0 | 1.046440e+02 | 9.675030e+01 | 0.0000 | 37.00 | 85.0000 | 137.00 | 8.100000e+02 |
| f2 | 68693.0 | 8.605142e+00 | 3.810602e+01 | 0.0000 | 0.00 | 0.0000 | 0.00 | 8.200000e+02 |
| f3 | 68693.0 | 1.687202e+00 | 1.668166e+01 | 0.0000 | 0.00 | 0.0000 | 0.00 | 7.100000e+02 |
| f4 | 68693.0 | 5.067911e-01 | 9.163194e+00 | 0.0000 | 0.00 | 0.0000 | 0.00 | 5.070000e+02 |
# Missing values per column ('mag' and 'loss' are the only columns with NaNs).
df.isnull().sum()
om 0
yr 0
mo 0
dy 0
date 0
time 0
tz 0
datetime_utc 0
st 0
stf 0
mag 756
inj 0
fat 0
loss 27170
slat 0
slon 0
elat 0
elon 0
len 0
wid 0
ns 0
sn 0
f1 0
f2 0
f3 0
f4 0
fc 0
dtype: int64
# Discard every row that contains at least one missing value.
df.dropna(inplace=True)

# Encode the date as days elapsed since the earliest date in the dataset,
# giving the models a single monotonically increasing numeric feature.
df['date'] = pd.to_datetime(df['date'])
df['date_numeric'] = (df['date'] - df['date'].min()).dt.days

# Parse the HH:MM:SS time strings, then encode each timestamp as seconds
# since midnight in a new 'time_numeric' column.
df['time'] = pd.to_datetime(df['time'], format='%H:%M:%S')
seconds_since_midnight = (
    df['time'].dt.hour * 3600
    + df['time'].dt.minute * 60
    + df['time'].dt.second
)
df['time_numeric'] = seconds_since_midnight
# Numeric columns to include in the correlation analysis.
numeric_columns = [
    'om', 'yr', 'mo', 'dy', 'stf', 'mag', 'inj', 'fat', 'loss',
    'slat', 'slon', 'elat', 'elon', 'len', 'wid', 'ns', 'sn',
    'f1', 'f2', 'f3', 'f4',
]
# Pairwise (Pearson) correlations over that numeric subset.
correlation_matrix = df[numeric_columns].corr()
print(correlation_matrix)
om yr mo dy stf mag inj \
om 1.000000 0.641333 -0.014129 0.038291 0.017673 -0.123721 -0.018815
yr 0.641333 1.000000 0.011081 0.020438 -0.011882 -0.270327 -0.035231
mo -0.014129 0.011081 1.000000 -0.022037 0.017126 -0.053477 -0.023666
dy 0.038291 0.020438 -0.022037 1.000000 0.007341 -0.010681 -0.009103
stf 0.017673 -0.011882 0.017126 0.007341 1.000000 0.005377 -0.011306
mag -0.123721 -0.270327 -0.053477 -0.010681 0.005377 1.000000 0.230651
inj -0.018815 -0.035231 -0.023666 -0.009103 -0.011306 0.230651 1.000000
fat -0.007449 -0.027722 -0.025170 -0.003246 -0.009078 0.207488 0.757170
loss 0.011247 0.019826 -0.009462 -0.001654 -0.005006 0.147667 0.528723
slat -0.036809 -0.070104 0.057358 -0.002550 0.153543 0.065990 -0.003999
slon 0.076815 0.108611 0.038201 -0.010476 -0.118155 -0.030855 0.017725
elat 0.352794 0.512345 -0.021748 0.008169 0.012422 0.135053 0.058227
elon -0.356862 -0.523312 0.036354 -0.011956 0.001557 -0.127254 -0.055493
len 0.006556 -0.037387 -0.054607 -0.002215 -0.027625 0.415129 0.252756
wid 0.166654 0.151644 -0.068346 0.014518 0.001584 0.384436 0.193788
ns 0.041793 0.039080 -0.014718 0.004426 -0.007908 0.124138 0.115227
sn -0.041965 -0.039049 0.015142 -0.005649 0.007918 -0.124903 -0.117668
f1 0.010428 0.001086 -0.036121 0.002581 0.233779 -0.004434 -0.014978
f2 0.045533 0.031565 -0.036306 -0.003671 0.017878 0.203320 0.074652
f3 0.009687 -0.012851 -0.029028 0.005268 -0.010504 0.157598 0.096212
f4 -0.002056 -0.019453 -0.024600 0.002862 -0.006223 0.109177 0.074728
fat loss slat ... elat elon len \
om -0.007449 0.011247 -0.036809 ... 0.352794 -0.356862 0.006556
yr -0.027722 0.019826 -0.070104 ... 0.512345 -0.523312 -0.037387
mo -0.025170 -0.009462 0.057358 ... -0.021748 0.036354 -0.054607
dy -0.003246 -0.001654 -0.002550 ... 0.008169 -0.011956 -0.002215
stf -0.009078 -0.005006 0.153543 ... 0.012422 0.001557 -0.027625
mag 0.207488 0.147667 0.065990 ... 0.135053 -0.127254 0.415129
inj 0.757170 0.528723 -0.003999 ... 0.058227 -0.055493 0.252756
fat 1.000000 0.471312 -0.006840 ... 0.049496 -0.048960 0.235652
loss 0.471312 1.000000 -0.000465 ... 0.043983 -0.043704 0.154692
slat -0.006840 -0.000465 1.000000 ... 0.134238 0.003470 0.027393
slon 0.009747 0.004238 -0.123253 ... 0.057484 0.027445 -0.014806
elat 0.049496 0.043983 0.134238 ... 1.000000 -0.973359 0.344643
elon -0.048960 -0.043704 0.003470 ... -0.973359 1.000000 -0.341728
len 0.235652 0.154692 0.027393 ... 0.344643 -0.341728 1.000000
wid 0.184129 0.183187 -0.005657 ... 0.265341 -0.273972 0.361725
ns 0.117815 0.049873 0.013577 ... 0.095309 -0.089331 0.263836
sn -0.119968 -0.051055 -0.013804 ... -0.096143 0.090145 -0.265096
f1 -0.010800 -0.004242 -0.190463 ... -0.028823 -0.009024 -0.031833
f2 0.052892 0.057888 -0.008680 ... 0.200617 -0.205818 0.360852
f3 0.080189 0.079589 -0.006792 ... 0.096825 -0.097735 0.413864
f4 0.059851 0.068563 -0.002527 ... 0.055573 -0.055220 0.383734
wid ns sn f1 f2 f3 f4
om 0.166654 0.041793 -0.041965 0.010428 0.045533 0.009687 -0.002056
yr 0.151644 0.039080 -0.039049 0.001086 0.031565 -0.012851 -0.019453
mo -0.068346 -0.014718 0.015142 -0.036121 -0.036306 -0.029028 -0.024600
dy 0.014518 0.004426 -0.005649 0.002581 -0.003671 0.005268 0.002862
stf 0.001584 -0.007908 0.007918 0.233779 0.017878 -0.010504 -0.006223
mag 0.384436 0.124138 -0.124903 -0.004434 0.203320 0.157598 0.109177
inj 0.193788 0.115227 -0.117668 -0.014978 0.074652 0.096212 0.074728
fat 0.184129 0.117815 -0.119968 -0.010800 0.052892 0.080189 0.059851
loss 0.183187 0.049873 -0.051055 -0.004242 0.057888 0.079589 0.068563
slat -0.005657 0.013577 -0.013804 -0.190463 -0.008680 -0.006792 -0.002527
slon -0.015333 0.014608 -0.014400 -0.111574 -0.002058 0.002593 -0.001872
elat 0.265341 0.095309 -0.096143 -0.028823 0.200617 0.096825 0.055573
elon -0.273972 -0.089331 0.090145 -0.009024 -0.205818 -0.097735 -0.055220
len 0.361725 0.263836 -0.265096 -0.031833 0.360852 0.413864 0.383734
wid 1.000000 0.145225 -0.146145 0.004304 0.182506 0.152638 0.115759
ns 0.145225 1.000000 -0.991692 -0.120844 -0.022475 -0.009025 -0.006423
sn -0.146145 -0.991692 1.000000 0.121857 0.023013 0.009024 0.006458
f1 0.004304 -0.120844 0.121857 1.000000 0.081928 0.038337 0.024276
f2 0.182506 -0.022475 0.023013 0.081928 1.000000 0.325328 0.170388
f3 0.152638 -0.009025 0.009024 0.038337 0.325328 1.000000 0.434265
f4 0.115759 -0.006423 0.006458 0.024276 0.170388 0.434265 1.000000
[21 rows x 21 columns]
# Análisis Exploratorio de Datos
# Heatmap of the correlation matrix.
# Fixes: the original wrote `plt.show` without parentheses, so show() was
# never called; it also passed fmt='.2f' without annot=True, which has no
# effect (fmt only formats the annotations).
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='Blues')
plt.show()

# Same matrix with rows/columns reordered by hierarchical clustering,
# grouping strongly correlated variables together.
sns.clustermap(correlation_matrix, annot=True, fmt='.2f', cmap='Blues')
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
# Análisis Univariado
# Relationship between tornado magnitude and reported property loss.
fig, ax = plt.subplots()
ax.scatter(df['mag'], df['loss'])
ax.set_xlabel('Magnitude')
ax.set_ylabel('Loss')
ax.set_title('Scatter Plot: Magnitude vs. Loss')
plt.show()
# Distribution of magnitude within each state.
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(data=df, x='st', y='mag', ax=ax)
ax.set_xlabel('State')
ax.set_ylabel('Magnitude')
ax.set_title('Box Plot: Magnitude by State')
ax.tick_params(axis='x', rotation=90)  # vertical labels: many state codes
plt.show()
# Frequency distribution of tornado magnitudes.
fig, ax = plt.subplots()
ax.hist(df['mag'], bins=20)
ax.set_xlabel('Magnitude')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Magnitude')
plt.show()
# Number of recorded tornadoes per state.
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(data=df, x='st', ax=ax)
ax.set_xlabel('State')
ax.set_ylabel('Count')
ax.set_title('Counts by State')
ax.tick_params(axis='x', rotation=90)  # vertical labels: many state codes
plt.show()
# Share of records contributed by each state.
state_pie = px.pie(df, names='st', title='Distribution of States')
state_pie.show()

# Spread and outliers of the magnitude column.
mag_box = px.box(df, x='mag', title='Distribution of Magnitude')
mag_box.show()
# Pairwise scatter plots for a handful of numeric variables.
subset_columns = ['mag', 'loss', 'len', 'wid', 'inj']
pair_fig = px.scatter_matrix(
    df,
    dimensions=subset_columns,
    title='Pair Plot of Numeric Variables',
)
pair_fig.show()
# Total damage ('loss') per year, in hundreds of millions of dollars, with a
# horizontal bar chart of the ten costliest years.
# NOTE: the original comment said "count the injuries" and named the variable
# injuries_sorted_by_year, but the code aggregates the 'loss' column — the
# name and comment are corrected here; the computation is unchanged.
loss_by_year = (df.groupby('yr')['loss'].sum() / 100000000).sort_values(ascending=False)
loss_by_year[:10].plot(kind='barh', color=sns.color_palette('Spectral'), figsize=(10, 7), xlabel='Total Loss (in 100 millions)$')
<Axes: xlabel='Total Loss (in 100 millions)$', ylabel='yr'>
# Magnitude vs. loss vs. path length, colored by fatality count.
fig = px.scatter_3d(
    df, x='mag', y='loss', z='len', color='fat',
    title='3D Scatter Plot: Magnitude vs. Loss vs. Length',
)
fig.show()

# Tornado start coordinates plotted on a world map.
fig = px.scatter_geo(df, lat='slat', lon='slon', title='Geographic Scatter Plot')
fig.update_geos(projection_type='natural earth')
fig.show()
# MODELOS
## Modelo de Regresión
# Features (independent variables) and the target variable.
X = df[['mag', 'slat', 'slon', 'elat', 'elon', 'len', 'wid']]
y = df['loss']

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit an ordinary least-squares linear regression on the training split.
model = LinearRegression()
model.fit(X_train, y_train)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
# Predict losses for the held-out test rows.
y_pred = model.predict(X_test)
# Predicciones
# Evaluate the fitted model on the held-out split with MSE and R-squared.
metrics = {
    "Mean Squared Error": mean_squared_error(y_test, y_pred),
    "R-squared (R2)": r2_score(y_test, y_pred),
}
for label, value in metrics.items():
    print(f"{label}: {value:.2f}")
Mean Squared Error: 535175920589345.81
R-squared (R2): 0.02
# Compare predicted vs. actual losses; points near the diagonal are accurate.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_xlabel('Actual Loss')
ax.set_ylabel('Predicted Loss')
ax.set_title('Actual vs. Predicted Loss')
plt.show()
from sklearn.impute import SimpleImputer
# Mean-impute missing feature values before the next model.
# NOTE(review): df.dropna() was applied earlier, so X_train/X_test should
# contain no NaNs and this is likely a no-op — TODO confirm.  Also note that
# fit_transform returns a plain ndarray, so the DataFrame column names are
# lost from here on.  Fitting on the training split only and reusing the
# learned means on the test split avoids leaking test statistics.
imputer = SimpleImputer(strategy='mean') # You can use other strategies like 'median' or 'constant'
# Fit and transform the imputer on your data
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)
# HistGradientBoostingRegressor is stable since scikit-learn 1.0, so the
# `sklearn.experimental.enable_hist_gradient_boosting` shim import is no
# longer needed (recent scikit-learn releases reject that import outright).
from sklearn.ensemble import HistGradientBoostingRegressor

# Train a histogram-based gradient-boosting regressor on the (imputed)
# training split, using the same features/target as the linear model.
model = HistGradientBoostingRegressor()
model.fit(X_train, y_train)
HistGradientBoostingRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
HistGradientBoostingRegressor()
# Predict on the test split and visualise predictions against actual values.
y_pred = model.predict(X_test)

fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_xlabel('Actual Loss')
ax.set_ylabel('Predicted Loss')
ax.set_title('Actual vs. Predicted Loss (HistGradientBoostingRegressor)')
plt.show()